import numpy as np

# Seed NumPy's global random number generator before anything else runs.
np.random.seed(5991)
import pandas as pd
import time
import json
import re
# import seaborn as sns
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import pyreadr
import os
import pickle

# Make 'browser' the default plotly renderer for figures that are shown.
pio.renderers.default = 'browser'
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

import torch

# The return value is discarded, so this line only reports something when it
# is evaluated in an interactive session.
torch.cuda.is_available()

# Load the first batch of sentence-level text from an R data file; the data
# frame is read from the result under the key 'spacy_sentence_df'.
rfile = pyreadr.read_r('spacyr_sentences.rda')
txts_df1 = rfile['spacy_sentence_df']

# Load the second batch and shift its doc_id values by 165 before stacking.
# NOTE(review): presumably 165 is the number of documents in the first batch,
# so that ids of the two batches do not collide -- confirm against the data.
new_rfile = pyreadr.read_r('NEW_spacyr_sentences.rda')
txts_df2 = new_rfile['spacy_sentence_df']
txts_df2['doc_id'] = txts_df2['doc_id'] + 165

# Stack both batches into one frame with a fresh 0..n-1 row index.
txts_df = pd.concat([txts_df1, txts_df2], ignore_index=True)
# now there are 238 documents with 28181 total sentences

# The per-batch frames are not needed once they are concatenated.
del txts_df1
del txts_df2

# Parse the 'date' column with a year-month format and keep the parsed values
# as a plain list, in the same row order as the sentences.
txts_df['date'] = pd.to_datetime(txts_df['date'], format='%Y-%m')
timestamps = txts_df['date'].tolist()

qdocs = txts_df.token.values  # return the sentences into array so BERTopic can use it
# Sentence embedding model handed to BERTopic further down.
sent_model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
sent_model.max_seq_length = 8192
# save this model because BERTopic needs it when saving and loading the BERTopic model
# sent_model.save(path='Embedding Model GTE BASE V15')

from umap import UMAP

# set as default given in BERTopic doc
# random_state is fixed to the same value as the NumPy seed at the top of the file.
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine', min_dist=0, random_state=5991)

from sklearn.feature_extraction.text import CountVectorizer


# Regex alternation takes the first alternative that matches at a position, so
# each longer phrase is listed before the shorter phrase it starts with.
# Otherwise 'quantum computers' would be matched as 'quantum computer' and
# leave a stray "s" behind in the text.
_STOP_PHRASE_PATTERN = re.compile(
    '|'.join(['quantum computers', 'quantum computer', 'quantum computing',
              'quantum technologies', 'quantum technology']),
    flags=re.IGNORECASE)


def remove_stop_phrases(doc):
    """Delete the corpus-wide stop phrases from *doc* and lower-case the rest.

    The phrases are matched case-insensitively and replaced by the empty
    string; surrounding whitespace is left untouched.

    Args:
        doc: The text of one document (a sentence in this script).

    Returns:
        The lower-cased text with every stop phrase removed.
    """
    return _STOP_PHRASE_PATTERN.sub("", doc).lower()


class NewVectorizer(CountVectorizer):
    """CountVectorizer whose n-gram step never emits the lone token 'quantum'.

    N-grams that merely contain the word (e.g. 'quantum annealing') are kept;
    only the single-word token 'quantum' is removed.
    """

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into word n-grams and drop the bare token 'quantum'.

        Args:
            tokens: List of word tokens for one document.
            stop_words: Optional collection of words to remove before the
                n-grams are built.

        Returns:
            A list with the n-grams selected by ``self.ngram_range``, without
            the single-word token 'quantum'.
        """
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            # Everything below needs original_tokens, which only exists when
            # max_n != 1, so it must stay inside this branch; with
            # ngram_range=(1, 1) the unigram list is used as it is.
            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))

        # remove the word 'quantum' from being displayed as a keyword on its own
        return [token for token in tokens if token != 'quantum']


# Vectorizer handed to BERTopic: 1- to 3-word n-grams, English stop words,
# and remove_stop_phrases as the preprocessor.
vectorizer_model = NewVectorizer(ngram_range=(1, 3), stop_words="english",
                                 preprocessor=remove_stop_phrases)

import hdbscan

# min_cluster_size: default 5, BERTopic sets at 10, controls min size of cluster and number of clusters generated
# Increasing this value results in fewer clusters but of larger size
# decreasing this value results in more micro clusters being generated
# BERTopic advises to increase this value than decrease it. Keep at 10 if small dataset

# min_samples: default equal to min_cluster_size, higher discards more outliers to increase cluster size
# controls the number of outliers generated.
# too small and too noisy
clusterer = hdbscan.HDBSCAN(min_cluster_size=46, min_samples=7, metric='euclidean', prediction_data=True)

# Fit the topic model on every sentence and time the run.
starttime = time.time()
topic_model1 = BERTopic(language='english', umap_model=umap_model, embedding_model=sent_model,
                        hdbscan_model=clusterer, vectorizer_model=vectorizer_model, top_n_words=15,
                        calculate_probabilities=True)
topics1, probs1 = topic_model1.fit_transform(qdocs)
endtime = time.time()  # default takes about 213 sec, custom HDBSCAN takes 114 sec
print("Time elapsed: {}".format(endtime - starttime))

# Number of sentences that were assigned to topic -1.
print("Topic -1 occurrence: ", topics1.count(-1))

# saving the first final version of BERTopic model
topic_model1.save(path="BERTopic Model", serialization="safetensors", save_ctfidf=True)
# loading the BERTopic model
# NOTE(review): the embedding model is read from a local directory here, but
# the only visible call that would write that directory (sent_model.save,
# further up in this file) is commented out -- confirm the directory exists
# before running these lines.
sent_model = SentenceTransformer("Embedding Model GTE BASE V15", trust_remote_code=True)
sent_model.max_seq_length = 8192
topic_model1 = BERTopic.load("BERTopic Model", embedding_model=sent_model)

# The string below is disabled example code (a quick check that the reloaded
# model can assign a topic to a new sentence); as a bare string it is not run.
'''
new_docs = ["This is a test document to check if my embedding model works"]
topics, _ = topic_model1.transform(new_docs)
print(f"Assigned topic: {topics[0]}")
'''

# One row per document with its probability under each topic, plus a
# 'main percentage' column holding the largest probability of that row.
probs_df = pd.DataFrame(probs1)
probs_df['main percentage'] = probs_df.max(axis=1)

# Keyword table of the fitted model, written out as CSV.
t_topics = pd.DataFrame.from_dict(topic_model1.get_topics())
# default gives 281 topics and 50.2% in topic -1
# custom optimized gives 105 topics and 46% in topic -1
t_topics.to_csv(r'final_updated_topics.csv', index=False, header=True)

# Round-trip the per-document topic assignments through JSON
# (indent=2 only makes the file easier to read).
with open("topics1.json", 'w') as f:
    json.dump(topics1, f, indent=2)
with open("topics1.json", 'r') as f:
    topics1 = json.load(f)

# Round-trip the probability matrix through NumPy's .npy format.
np.save('probs1.npy', probs1)
probs1 = np.load('probs1.npy')

### GET TOP 15 REP DOCS OF A TOPIC
# Document table in the layout expected by the extraction call below:
# the text, a running id and the topic currently assigned by the model.
documents = pd.DataFrame({"Document": qdocs,
                          "ID": range(len(qdocs)),
                          "Topic": topic_model1.topics_})
# Ask for 15 representative documents per topic.
repr_docs, _, _, _ = topic_model1._extract_representative_docs(
    c_tf_idf=topic_model1.c_tf_idf_,
    documents=documents,
    topics=topic_model1.topic_representations_,
    nr_repr_docs=15)

# One column per topic ('topic0', 'topic1', ...), one representative sentence
# per row; t_topics has one extra column, hence the "- 1".
n_listed_topics = len(t_topics.columns) - 1
repdocsDF = pd.DataFrame({'topic' + str(topic_no): pd.Series(repr_docs[topic_no])
                          for topic_no in range(n_listed_topics)})

# repdocsDF.to_csv(r'top15_rep_sentences.csv', index=False, header=True)
repdocsDF.to_csv(r'MERGED_top15_rep_sentences.csv', index=False, header=True)

intertopicfig = topic_model1.visualize_topics()  # see intertopic distance map
# intertopicfig.write_html("intertopicfig.html")

# Topic hierarchy computed from the sentences, written to a standalone HTML file.
hierarchical_topics = topic_model1.hierarchical_topics(qdocs)
hierarchicalfig = topic_model1.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
hierarchicalfig.write_html('hierarchicalfig.html')

# examine documents (sentences) of a specific topic
# One row per sentence: text, assigned topic (from topics1 as it is at this
# point in the script), source document id and date.
df_results = pd.DataFrame({"Doc": qdocs, "Topic": topics1,
                           "DocID": txts_df['doc_id'], "Date": txts_df['date']})
docscheck8 = df_results.loc[df_results['Topic'] == 8]
docscheck11 = df_results.loc[df_results['Topic'] == 11]


# Persist the sentence table and read it straight back.
df_results.to_pickle('All_Sentences.pkl')
# Load the .pkl file
with open('All_Sentences.pkl', 'rb') as f:  # 'rb' stands for read-binary mode
    df_results = pickle.load(f)


# Same table as CSV, without the index column.
df_results.to_csv(r'all_sentences.csv', index=False, header=True)


"""
MERGING TOPICS
"""
# Each inner list is one group of topic numbers that is merged into a single
# topic. The first group contains -1, i.e. those topics are merged with
# topic -1.
topics_to_merge = [[-1, 58, 63, 70, 72, 75, 79, 83, 88, 90, 96, 97],
                   [0, 33, 57, 85, 92, 102], [4, 18], [6, 36, 52, 71, 77],
                   [7, 64], [8, 11], [9, 23, 100], [10, 16, 26, 62], [12, 30, 101],
                   [13, 50, 54], [14, 17], [19, 28, 43, 65, 73], [20, 40, 80], [21, 89],
                   [22, 34, 55, 66, 68, 69, 76, 91], [25, 82], [27, 67],
                   [29, 81, 94], [31, 95], [35, 51, 53], [38, 60, 93], [41, 46],
                   [42, 56], [45, 86], [47, 87], [49, 78]]
topic_model1.merge_topics(qdocs, topics_to_merge)

# Keyword table of the merged model, written out as CSV.
t_topics = topic_model1.get_topics()
t_topics = pd.DataFrame.from_dict(t_topics)

t_topics.to_csv(r'Merged_TopicsV1.csv', index=False, header=True)

# update topics and probs after merging
topics1 = topic_model1.topics_
probs1 = topic_model1.probabilities_

# save the model, topics, and probs again after merge
topic_model1.save(path="MERGED BERTopic Model", serialization="safetensors", save_ctfidf=True)
# loading the BERTopic model
sent_model = SentenceTransformer("Embedding Model GTE BASE V15", trust_remote_code=True)
sent_model.max_seq_length = 8192
topic_model1 = BERTopic.load("MERGED BERTopic Model", embedding_model=sent_model)

# save topics
with open("MERGED_topics1.json", 'w') as f:
    # indent=2 not needed but makes the file human-readable if data is nested
    json.dump(topics1, f, indent=2)
# LOAD topics1 file
with open("MERGED_topics1.json", 'r') as f:
    topics1 = json.load(f)

# df_results was filled with the topic assignments from before the merge.
# Every topic number used from here on (29 below, and the topic checks later
# in the file) refers to the merged topics, so the column has to be refreshed
# or those lookups select sentences of unrelated pre-merge topics.
df_results['Topic'] = topics1

# save probs
# NOTE(review): probs1 comes from topic_model1.probabilities_; confirm it is an
# array (not None) at this point, otherwise the np.load below cannot read it.
np.save('MERGED_probs1.npy', probs1)
# LOAD probs1 file
probs1 = np.load('MERGED_probs1.npy')

t_topics = topic_model1.get_topics()
t_topics = pd.DataFrame.from_dict(t_topics)

# topic about quantum risk assessments
docscheck29 = df_results.loc[df_results['Topic'] == 29]

# custom topic labels
# Generated labels: topic-number prefix followed by three words per topic,
# each word cut to 10 characters and joined with ", ".
topic_labels = topic_model1.generate_topic_labels(
    topic_prefix=True,
    nr_words=3,
    word_length=10,
    separator=", ",
)

# Hand-written titles; a title's position in this list is its topic number.
topictitle = [
    'Qubits',
    'Public Key Encryption Algorithms',
    'Quantum Sensors and Metrology',
    'Medical Drug Developments',
    'Financial Services and Risk Management',
    'Classical Computer Limitations',
    'Atomistic and Molecular Simulations',
    'Logistics Optimizations',
    'Businesses Need to Prepare Early',
    'Post Quantum Cryptography',
    'Quantum Key Distributions',
    'Barriers to Adoption in Industry',
    'Technical Uncertainties',
    'New Tech Skills',
    'Investments and Funding',
    'Quantum Machine Learning',
    'Patent Analyses and Technology Cycles',
    'Renewable Energy',
    'Quantum Applications Timeline Projections',
    'Quantum Annealing',
    'Quantum Networks',
    'Photons and Imaging',
    'Cloud Quantum Computing Services',
    'New Materials to Lower Carbon Emissions',
    'IBM and Google Quantum Developments',
    'Atomic Energy Modeling',
    'Quantum Phase Calculations',
    'Find Ecosystems and Partnerships',
    'Market Value Estimations',
    'Quantum Security Risk Assessments',
    'Identify Use Cases',
    'Quadratic Unconstrained Binary Optimization',
    'Quantum Random Number Generators',
    'Consultant Work',
    'Develop Benchmarks',
    'AI and Quantum Winter',
    'Trapped Ions',
    'Military Applications',
    'Impacts on Tax',
    'Noisy Intermediate Scale Quantum',
    'Hybrid Quantum Classical Approach',
    'Ethical Risks',
    'Magic State Factories',
]
# Topic numbers 0..42, one per title above.
topicnums = list(range(len(topictitle)))
topic_model1.set_topic_labels(dict(enumerate(topictitle)))

"""
CREATING WORDCLOUD VISUALIZATIONS
"""
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import textwrap


def create_wordcloud(model, topic, title, ax=None, wrap_length=20):
    """Draw a word cloud of one topic's keywords, sized by their scores.

    Args:
        model: Topic model whose ``get_topic(topic)`` yields (word, score) pairs.
        topic: Topic number to draw.
        title: Plot title; wrapped onto several lines of at most
            ``wrap_length`` characters.
        ax: Axes to draw on. When omitted, a new figure is created and shown.
        wrap_length: Maximum characters per title line.
    """
    frequencies = dict(model.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=15)
    cloud.generate_from_frequencies(frequencies)

    heading = '\n'.join(textwrap.wrap(title, wrap_length))

    if ax is not None:
        # Draw into the caller's subplot; the caller decides when to show it.
        ax.imshow(cloud, interpolation="bilinear")
        ax.set_title(heading, fontsize=24, fontweight='bold')
        ax.axis("off")
        return

    # Stand-alone mode: own high-resolution figure, displayed immediately.
    plt.figure(figsize=(10, 8), dpi=300)
    plt.imshow(cloud, interpolation="bilinear")
    plt.title(heading, fontsize=24, fontweight='bold')
    plt.axis("off")
    plt.show()


# Create a figure with 6 subplots in a 2×3 grid
fig, axes = plt.subplots(2, 3, figsize=(24, 16), dpi=300)  # Added DPI=300 here
# Flatten the axes array for easier indexing
axes = axes.flatten()
# Define topics and titles
topics = [8, 11, 12, 17, 35, 41]
titles = [
    'Topic 8 Businesses Need to Prepare Early',
    'Topic 11 Barriers to Adoption in Industry',
    'Topic 12 Technical Uncertainties & Limitations',
    'Topic 17 Renewable Energy',
    'Topic 35 AI and Quantum Winter',
    'Topic 41 Ethical Risks'
]
# Generate all wordclouds
for i, (topic, title) in enumerate(zip(topics, titles)):
    create_wordcloud(topic_model1, topic=topic, title=title, ax=axes[i], wrap_length=31)

# Add more space for the wrapped titles
plt.tight_layout(pad=3.0)
# If you want to save the high-resolution figure
#plt.savefig('wordcloud_topics.png', dpi=300)  # Optional - to save the figure
plt.show()


# critical/skeptical topics
# NOTE(review): these lookups use merged topic numbers; confirm that
# df_results['Topic'] holds the post-merge assignments when this runs.
docscheck11 = df_results.loc[df_results['Topic'] == 11]
docscheck12 = df_results.loc[df_results['Topic'] == 12]
docscheck35 = df_results.loc[df_results['Topic'] == 35]
docscheck41 = df_results.loc[df_results['Topic'] == 41]
cautioningtopics = pd.concat([docscheck11, docscheck12, docscheck35, docscheck41])
# The two counts below are not stored or printed; they only show a value when
# evaluated interactively.
cautioningtopics['Doc'].nunique()  # 1063 unique sentences
cautioningtopics['DocID'].nunique()  # 186 unique docs

# further topics pulled out for inspection (8, 27, 29 and 34)
docscheck8 = df_results.loc[df_results['Topic'] == 8]
docscheck27 = df_results.loc[df_results['Topic'] == 27]
docscheck29 = df_results.loc[df_results['Topic'] == 29]
docscheck34 = df_results.loc[df_results['Topic'] == 34]


from typing import List, Union
import plotly.graph_objects as go
from sklearn.preprocessing import normalize


def visualize_topics_over_time(topic_model,
                               topics_over_time: pd.DataFrame,
                               top_n_topics: int = None,
                               topics: List[int] = None,
                               normalize_frequency: bool = False,
                               custom_labels: Union[bool, str] = False,
                               title: str = "<b>Topics over Time</b>",
                               width: int = 1250,
                               height: int = 450) -> go.Figure:
    """Plot the frequency of selected topics over time as one line per topic.

    Args:
        topic_model: Fitted topic model; its topic frequencies, labels and
            (optionally) custom labels or topic aspects are read.
        topics_over_time: Frame with at least the columns Topic, Timestamp,
            Frequency and Words. A "Name" column is added to it in place.
        top_n_topics: Plot only the first ``top_n_topics`` topics of the
            model's frequency table (ignored when ``topics`` is given).
        topics: Explicit list of topic numbers to plot.
        normalize_frequency: Normalise each topic's frequencies before plotting.
        custom_labels: ``True`` to use the model's custom labels, or the name
            of a topic aspect whose words are used to build the labels.
        title: Figure title.
        width: Figure width.
        height: Figure height.

    Returns:
        The plotly figure.
    """
    # ADDED CUSTOM COLORS HERE
    colors = px.colors.qualitative.Light24
    # colors = px.colors.qualitative.Plotly

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        selected_topics = list(topics)
    elif top_n_topics is not None:
        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        selected_topics = sorted(freq_df.Topic.to_list())

    # Prepare data
    if isinstance(custom_labels, str):
        # Label = topic number plus the first three aspect words, capped at 30
        # characters. Names are built for the topics that are actually plotted
        # and keyed by their own topic number, so this also works when
        # `topics` is None or is only a subset of the model's topics.
        aspect = topic_model.topic_aspects_[custom_labels]
        topic_names = {}
        for topic in selected_topics:
            parts = [str(topic)] + [label[0] for label in aspect[topic]]
            label = "_".join(parts[:4])
            topic_names[topic] = label if len(label) < 30 else label[:27] + "..."
    elif topic_model.custom_labels_ is not None and custom_labels:
        topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in
                       topic_model.topic_labels_.items()}
    else:
        topic_names = {key: value[:40] + "..." if len(value) > 40 else value
                       for key, value in topic_model.topic_labels_.items()}
    topics_over_time["Name"] = topics_over_time.Topic.map(topic_names)
    data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"])

    # Add traces
    fig = go.Figure()
    for index, topic in enumerate(data.Topic.unique()):
        trace_data = data.loc[data.Topic == topic, :]
        topic_name = trace_data.Name.values[0]
        words = trace_data.Words.values
        if normalize_frequency:
            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
        else:
            y = trace_data.Frequency
        fig.add_trace(go.Scatter(x=trace_data.Timestamp, y=y,
                                 mode='lines',
                                 # wrap around so more traces than palette
                                 # entries reuse colours instead of failing
                                 marker_color=colors[index % len(colors)],
                                 hoverinfo="text",
                                 name=topic_name,
                                 hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))

    # Styling of the visualization
    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    fig.update_layout(
        yaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
        title={
            'text': f"{title}",
            'y': .95,
            'x': 0.40,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        template="simple_white",
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        legend=dict(
            title="<b>Global Topic Representation",
        )
    )
    return fig


# Topic frequencies over time, computed with the timestamps grouped into 65 bins.
topics_over_time = topic_model1.topics_over_time(qdocs, timestamps, nr_bins=65)

# One figure per theme; the topic numbers are the merged topic numbers and the
# legend uses the custom topic labels.
fig1 = visualize_topics_over_time(topic_model1, topics_over_time, topics=[13, 14, 16, 24, 28, 33],
                                  title='<b>Company and Market Development Topics over Time</b>',
                                  custom_labels=True)
fig2 = visualize_topics_over_time(topic_model1, topics_over_time, topics=[8, 11, 12, 18, 27, 29, 30, 34, 35, 41],
                                  title='<b>Rhetoric Topics over Time</b>',
                                  custom_labels=True)
# The technical theme is split over two figures (fig31 and fig32) that share
# the same title.
fig31 = visualize_topics_over_time(topic_model1, topics_over_time, topics=[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 15, 17, 19],
                                   title='<b>Technical Aspects and Applications Topics over Time</b>',
                                   custom_labels=True, height=500)
fig32 = visualize_topics_over_time(topic_model1, topics_over_time, topics=[20, 21, 22, 23, 25, 26, 31, 32, 36, 37, 38,
                                                                           39, 40, 42],
                                   title='<b>Technical Aspects and Applications Topics over Time</b>',
                                   custom_labels=True, height=500)

# Only fig1 is exported here, at 3x scale.
# NOTE(review): fig2, fig31 and fig32 are neither shown nor saved in the
# visible part of this file.
fig1.write_image("high_res_figure.png", scale=3)

"""
SENTIMENT ANALYSIS OF TOPICS
"""
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Labels: 0 = Neg, 1 = Neutral, 2 = Pos
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Score every representative sentence of every labelled topic.
# NOTE(review): repr_docs[n] is labelled with topictitle[n]; confirm repr_docs
# was extracted from the merged model so the numbering matches.
starttime = time.time()
repdocs_df = pd.DataFrame(columns=['TopicNumber', 'Topic', 'RepSentence',
                                   'Senti_Negative', 'Senti_Neutral', 'Senti_Positive'])
# Inference only: no_grad avoids building an autograd graph for every sentence.
with torch.no_grad():
    for n in topicnums:
        repdoc = repr_docs[n]
        for doc in repdoc:
            # Truncate so an over-long sentence cannot exceed the model's
            # input length (512 tokens for this RoBERTa-base checkpoint).
            encodedinput = tokenizer(doc, return_tensors='pt', truncation=True, max_length=512)
            modeloutput = model(**encodedinput)
            # softmax over the three logits, rounded to 4 decimals
            sentiscore = np.round(softmax(modeloutput[0][0].detach().numpy()), 4)
            listrow = [n, topictitle[n], doc, sentiscore[0], sentiscore[1], sentiscore[2]]
            repdocs_df.loc[len(repdocs_df)] = listrow
endtime = time.time()
print("Time elapsed: {}".format(endtime - starttime))  # 48 sec

repdocs_df.to_csv(r'Sentiment_by_sentence.csv', index=False, header=True)

# get mean sentiments grouped by each topic num
AvgSenti = repdocs_df.groupby('TopicNumber')[['Senti_Positive', 'Senti_Neutral', 'Senti_Negative']].mean()
# AvgSenti = repdocs_df.groupby(['TopicNumber','Topic'])[['Senti_Negative','Senti_Neutral','Senti_Positive']].mean()
# Attach the titles by position: row i of AvgSenti gets topictitle[i].
AvgSenti['Topic'] = topictitle

# company/market development themed topics
# (.iloc selects rows by position, so these numbers are row positions in AvgSenti)
senticompanydev = AvgSenti.iloc[[13, 14, 16, 24, 28, 33],]
# row order that sorts the topics by their positive score
orders = np.argsort(senticompanydev.Senti_Positive)
# theme-level means; the value is not stored, so it is only visible interactively
np.round(senticompanydev[['Senti_Positive', 'Senti_Neutral', 'Senti_Negative']].mean(), 4)

# Stacked horizontal bars, one bar per topic; the colour and legend order
# follow the column order Positive, Neutral, Negative.
fig = plt.figure(figsize=(10, 5))
senticompanydev.iloc[orders].plot.barh(x='Topic', stacked=True, ax=plt.gca(),
                                       color=['limegreen', 'lightgray', 'xkcd:blood orange'])
plt.legend(['Positive', 'Neutral', 'Negative'], title='Sentiment', loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Sentiment Analysis on Company and Market Development Topics')

# rhetoric themed topics
sentirhetoric = AvgSenti.iloc[[8, 11, 12, 18, 27, 29, 30, 34, 35, 41],]
orders = np.argsort(sentirhetoric.Senti_Positive)
np.round(sentirhetoric[['Senti_Positive', 'Senti_Neutral', 'Senti_Negative']].mean(), 4)

fig = plt.figure(figsize=(10, 5))
sentirhetoric.iloc[orders].plot.barh(x='Topic', stacked=True, ax=plt.gca(),
                                     color=['limegreen', 'lightgray', 'xkcd:blood orange'])
plt.legend(['Positive', 'Neutral', 'Negative'], title='Sentiment', loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Sentiment Analysis on Rhetoric Topics')

# technical and applications themed topics
sentitechnical = AvgSenti.iloc[[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 15, 17, 19, 20, 21, 22, 23, 25, 26, 31, 32, 36, 37, 38,
                                39, 40, 42],]
orders = np.argsort(sentitechnical.Senti_Positive)
np.round(sentitechnical[['Senti_Positive', 'Senti_Neutral', 'Senti_Negative']].mean(), 4)

fig = plt.figure(figsize=(10, 5))
sentitechnical.iloc[orders].plot.barh(x='Topic', stacked=True, ax=plt.gca(),
                                      color=['limegreen', 'lightgray', 'xkcd:blood orange'])
plt.legend(['Positive', 'Neutral', 'Negative'], title='Sentiment', loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Sentiment Analysis on Technical Aspects and Applications Topics')

# Collapse each theme to one row: the mean of its topics' three scores.
senticompanydev2 = senticompanydev.iloc[:, :-1]  # remove last column which is just Topic name
senticompanydev2 = senticompanydev2.mean()

sentirhetoric2 = sentirhetoric.iloc[:, :-1]
sentirhetoric2 = sentirhetoric2.mean()

sentitechnical2 = sentitechnical.iloc[:, :-1]
sentitechnical2 = sentitechnical2.mean()

# One column per theme after the concat, transposed to one row per theme.
AvgSentiByTheme = pd.concat([senticompanydev2, sentirhetoric2, sentitechnical2], axis=1)
AvgSentiByTheme = AvgSentiByTheme.T
themes1 = ['Company and Market Development', 'Rhetoric', 'Technical Aspects and Applications']
AvgSentiByTheme['Category'] = themes1

orders = np.argsort(AvgSentiByTheme.Senti_Positive)
np.round(AvgSentiByTheme[['Senti_Positive', 'Senti_Neutral', 'Senti_Negative']].mean(), 4)
fig = plt.figure(figsize=(10, 5), dpi=300)
AvgSentiByTheme.iloc[orders].plot.barh(x='Category', stacked=True, ax=plt.gca(),
                                       color=['limegreen', 'lightgray', 'xkcd:blood orange'])
plt.legend(['Positive', 'Neutral', 'Negative'], title='Sentiment', loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Sentiment Analysis Average Scores by Category')

"""
SENTIMENT ANALYSIS OF ALL DOCUMENTS (SENTENCES)
"""


def getSentiScore(doc):
    """Return the rounded sentiment probabilities of one text.

    Args:
        doc: The text to classify.

    Returns:
        A list of three scores rounded to 4 decimals, in the model's label
        order (0 = negative, 1 = neutral, 2 = positive).
    """
    # Truncate so an over-long text cannot exceed the model's input length
    # (512 tokens for this RoBERTa-base checkpoint).
    encodedinput = tokenizer(doc, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip building an autograd graph for every call.
    with torch.no_grad():
        modeloutput = model(**encodedinput)
    sentiscore = np.round(softmax(modeloutput[0][0].detach().numpy()), 4)
    return [sentiscore[0], sentiscore[1], sentiscore[2]]


# Score every sentence in the corpus (one model call per row).
alldocs_senti = txts_df
sentiscores = alldocs_senti.apply(lambda x: getSentiScore(x['token']), axis=1)

sentiscores2 = pd.DataFrame(sentiscores.to_list(),
                            columns=['Senti_Negative', 'Senti_Neutral', 'Senti_Positive'])

sentiscores2.to_pickle('AllSentiScores.pkl')
# sentiscores2 = pd.read_pickle('AllSentiScores.pkl')

alldocs_senti = pd.concat([alldocs_senti, sentiscores2], axis=1)
# Written only after the scores are attached, so the pickle really contains
# every sentence together with its sentiment scores (and the reload at the end
# of this section gets the scores back as well).
alldocs_senti.to_pickle('All_Docs_Sentiment_Scores.pkl')
alldocs_senti = alldocs_senti.drop(['filename'], axis=1)  # drop filename column

# Mean score per date for the time-series plot.
alldocs_senti2 = alldocs_senti.groupby('date')[['Senti_Negative', 'Senti_Neutral', 'Senti_Positive']].mean()

plt.figure(figsize=(10, 5), dpi=300)
plt.plot(alldocs_senti2.index, alldocs_senti2['Senti_Positive'], color='limegreen')
plt.plot(alldocs_senti2.index, alldocs_senti2['Senti_Neutral'], color='lightgray')
plt.plot(alldocs_senti2.index, alldocs_senti2['Senti_Negative'], color='xkcd:blood orange')
plt.xlabel('Time')
plt.ylabel('Polarity Score')
plt.legend(['Positive', 'Neutral', 'Negative'], title='Sentiment', loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Sentiment Analysis Average Scores of All Documents over Time')
plt.xlim(pd.Timestamp('2017-01-01'), pd.Timestamp('2025-01-31'))
#plt.xlim(pd.Timestamp('2005-01-01'), pd.Timestamp('2025-01-31'))


# Load the .pkl file
with open('All_Docs_Sentiment_Scores.pkl', 'rb') as f:  # 'rb' stands for read-binary mode
    alldocs_senti = pickle.load(f)



"""
EMOTION ANALYSIS OF TOPICS
"""
from transformers import pipeline

# contains 28 emotion labels
classifier = pipeline("text-classification",
                      model="SamLowe/roberta-base-go_emotions",
                      return_all_scores=True)

# Column names for the 28 scores. The scores are attached by position below,
# so this order has to match the order in which the classifier returns them.
emotion_labels = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
                  'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
                  'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
                  'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
                  'relief', 'remorse', 'sadness', 'surprise', 'neutral']

# Score every representative sentence of every labelled topic.
# NOTE(review): repr_docs[n] is labelled with topictitle[n]; confirm repr_docs
# was extracted from the merged model so the numbering matches.
starttime = time.time()
repdocs_emo_df = pd.DataFrame(columns=['TopicNumber', 'Topic', 'RepSentence'] + emotion_labels)
for n in topicnums:
    repdoc = repr_docs[n]
    for doc in repdoc:
        emoclassifier = pd.DataFrame.from_records(classifier(doc)[0])
        emoscorelist = emoclassifier.score.tolist()
        emoscorelist = list(np.around(np.array(emoscorelist), 4))
        list1 = [n, topictitle[n], doc]
        repdocs_emo_df.loc[len(repdocs_emo_df)] = list1 + emoscorelist
endtime = time.time()
print("Time elapsed: {}".format(endtime - starttime))  # 52 sec

# get mean emotions grouped by each topic num
AvgEmotions = repdocs_emo_df.groupby('TopicNumber')[emotion_labels].mean()
# get 5 largest emotion scores from each topic
# Rank within each row from largest to smallest and break ties by column
# order, so exactly five emotions survive per topic. (A dense ascending rank
# compared against a fixed threshold keeps fewer than five as soon as any two
# scores in the row are equal.)
top5_mask = AvgEmotions.rank(axis=1, method='first', ascending=False) <= 5
AvgEmotions2 = AvgEmotions[top5_mask].round(4)
# drop columns that have NaNs for all rows
AvgEmotions2 = AvgEmotions2.dropna(axis=1, how='all')

# Fixed colour per surviving emotion column; zip stops at the shorter list, and
# the plots fall back to a default colour for any column without an entry.
emotioncols = list(AvgEmotions2.columns)
customcolors = ['#a6cee3', '#ff7f00', '#1f91b4', '#b2df8a', '#fdbf6f', '#fccde5',
                '#6a3d9a', '#b15928', '#e31a1c', '#cab2d6', '#45ba3d', '#ffff99',
                '#d9d9d9']
colandcolors = dict(zip(emotioncols, customcolors))

# Attach the titles by position: row i gets topictitle[i].
AvgEmotions2['Topic'] = topictitle

def _theme_emotion_shares(topic_rows):
    """Return row-normalised top-emotion scores for the given topic rows.

    Selects rows of ``AvgEmotions2`` by position, drops the emotion columns
    that are NaN for every selected topic, and divides each remaining score by
    its row sum. The scores do not add up to 1 per topic on their own, which
    makes the stacked bars hard to compare. The 'Topic' name column is kept as
    the last column.
    """
    subset = AvgEmotions2.iloc[topic_rows].dropna(axis=1, how='all')
    scores = subset.loc[:, subset.columns != 'Topic']
    shares = scores.div(scores.sum(axis=1), axis=0)
    shares['Topic'] = subset['Topic']
    return shares


def _plot_theme_emotions(shares, title):
    """Show a stacked horizontal bar chart of emotion shares, one bar per topic.

    Bars are ordered by the topic's 'neutral' share; colours come from
    ``colandcolors`` with '#333333' for columns that have no entry.
    """
    order = np.argsort(shares.neutral)
    plt.figure(figsize=(10, 5))
    shares.iloc[order].plot.barh(x='Topic', stacked=True, ax=plt.gca(),
                                 color=[colandcolors.get(x, '#333333') for x in shares.columns])
    plt.xlabel('Score')
    plt.title(title)
    plt.legend(title='Emotion', loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()


# company market development themed topics
emocompanydev = _theme_emotion_shares([13, 14, 16, 24, 28, 33])
_plot_theme_emotions(emocompanydev,
                     'Top 5 Emotions by Topic in Company and Market Development Category')


# rhetoric themed topics
emorhetoric = _theme_emotion_shares([8, 11, 12, 18, 27, 29, 30, 34, 35, 41])
_plot_theme_emotions(emorhetoric, 'Top 5 Emotions by Topic in Rhetoric Category')


# technical aspects and applications themed topics
emotechnical = _theme_emotion_shares([0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 15, 17, 19, 20, 21, 22, 23, 25, 26,
                                      31, 32, 36, 37, 38, 39, 40, 42])
_plot_theme_emotions(emotechnical,
                     'Top 5 Emotions by Topic in Technical Aspects and Applications Category')


# Collapse each theme to its five largest mean emotion shares.
emocompanydev2 = emocompanydev.iloc[:, :-1]  # remove last column which is just Topic name
emocompanydev2 = emocompanydev2.fillna(0)   # fill NaNs with 0s
emocompanydev2 = emocompanydev2.mean()
emocompanydev2 = emocompanydev2.nlargest()  # keep 5 largest emotions

emorhetoric2 = emorhetoric.iloc[:, :-1]
emorhetoric2 = emorhetoric2.fillna(0)
emorhetoric2 = emorhetoric2.mean()
emorhetoric2 = emorhetoric2.nlargest()

emotechnical2 = emotechnical.iloc[:, :-1]
emotechnical2 = emotechnical2.fillna(0)
emotechnical2 = emotechnical2.mean()
emotechnical2 = emotechnical2.nlargest()

# One row per theme; emotions that are not in a theme's top five become NaN
# in the concat and are set to 0 after the rows are normalised to sum to 1.
AvgEmoByTheme = pd.concat([emocompanydev2, emorhetoric2, emotechnical2], axis=1)
AvgEmoByTheme = AvgEmoByTheme.T
AvgEmoByTheme = AvgEmoByTheme.div(AvgEmoByTheme.sum(axis=1), axis=0)
themes1 = ['Company and Market Development', 'Rhetoric', 'Technical Aspects and Applications']
AvgEmoByTheme['Category'] = themes1
AvgEmoByTheme = AvgEmoByTheme.fillna(0)

# rearrange neutral column to put at second to last column
neutral_col = AvgEmoByTheme.pop('neutral')
# Position is derived from the current column count: directly before the
# trailing 'Category' column, however many emotion columns survived.
AvgEmoByTheme.insert(len(AvgEmoByTheme.columns) - 1, 'neutral', neutral_col)

orders = np.argsort(AvgEmoByTheme.neutral)
fig = plt.figure(figsize=(10, 5), dpi=300)
AvgEmoByTheme.iloc[orders].plot.barh(x='Category',
                                     stacked=True, ax=plt.gca(),
                                     color=[colandcolors.get(x, '#333333') for x in AvgEmoByTheme.columns])
plt.xlabel('Score')
plt.legend(title='Emotion', loc='center left', bbox_to_anchor=(1, 0.5))
# plt.subplots_adjust(bottom=.25, left=.25)
plt.title('Top 5 Emotions by Category')

"""
EMOTION ANALYSIS OF ALL DOCUMENTS (SENTENCES)
"""

def getEmoScore(doc):
    """Return the emotion scores for one text, rounded to 4 decimals.

    The module-level `classifier` is called on `doc`, and the first element
    of its result is read as a table of records with a `score` field.
    NOTE(review): `classifier` is defined outside this section; presumably it
    yields one score per emotion label in a fixed label order - confirm,
    because callers assign column names to the scores by position.

    Parameters
    ----------
    doc : the text handed to `classifier` (callers pass one sentence).

    Returns
    -------
    list
        The `score` values in the order the classifier returned them,
        each rounded to 4 decimal places.
    """
    first_result = classifier(doc)[0]
    score_table = pd.DataFrame.from_records(first_result)
    rounded_scores = np.around(np.array(score_table.score.tolist()), 4)
    return list(rounded_scores)

# Same object as txts_df (no copy is made here); it is rebound to a new frame
# once the scores are joined on further down.
alldocs_emo = txts_df

# Score every sentence in the 'token' column; the result is a Series holding
# one list of scores per row, aligned on the frame's index.
starttime = time.time()
emoscores = pd.Series(
    [getEmoScore(sentence) for sentence in alldocs_emo['token']],
    index=alldocs_emo.index,
)
endtime = time.time()
elapsed = endtime - starttime
print("Time elapsed: {}".format(elapsed))  # about 27 min

# Column names for the 28 scores of each sentence, assigned by position.
# NOTE(review): this assumes getEmoScore returns the scores in exactly this
# label order - confirm against the classifier's label configuration.
emo_labels = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]
# Expand the per-sentence score lists into one column per emotion.
emoscores2 = pd.DataFrame(emoscores.to_list(), columns=emo_labels)
# Cache the scores on disk so the long scoring run need not be repeated.
emoscores2.to_pickle('AllEmoScores.pkl')
#emoscores2 = pd.read_pickle('AllEmoScores.pkl')

# Mean score of every emotion over all sentences.
emomeantest = emoscores2.mean()
# Names of the ten emotions with the highest mean, largest first.
ranked_means = emomeantest.sort_values(ascending=False)
emocols = ranked_means.index[:10].tolist()
# Keep only those ten score columns, in their original column order.
top_emotion_mask = emoscores2.columns.isin(emocols)
emoscores2 = emoscores2.loc[:, top_emotion_mask]

# Attach the scores to the sentence table (side by side, aligned on the index)
# and drop the filename column.
alldocs_emo = pd.concat([alldocs_emo, emoscores2], axis=1).drop(columns=['filename'])
# Average each kept emotion over all sentences that share a date.
alldocs_emo2 = alldocs_emo.groupby('date')[emocols].mean()

# Line plot of the per-date mean score of each top-10 emotion.
# Colours for the emotions expected in the top 10. The lines are driven by
# `emocols` rather than a hard-coded list, so a different top 10 neither raises
# KeyError nor mislabels a line; an emotion without an entry here falls back to
# dark grey, the same fallback the bar charts use.
emo_line_colors = {
    'admiration': '#a6cee3',
    'approval': '#1f91b4',
    'confusion': '#fdbf6f',
    'curiosity': '#fccde5',
    'desire': '#6a3d9a',
    'disappointment': '#b15928',
    'disapproval': '#e31a1c',
    'neutral': '#d9d9d9',
    'optimism': '#45ba3d',
    'realization': '#ffff99',
}

plt.figure(figsize=(10, 5), dpi=300)
# Alphabetical order keeps the legend order stable; each line carries its own
# label so the legend cannot drift out of step with the plotted columns.
for emo in sorted(emocols):
    plt.plot(alldocs_emo2.index, alldocs_emo2[emo],
             color=emo_line_colors.get(emo, '#333333'), label=emo)

plt.xlabel('Time')
plt.ylabel('Score')
# Anchor the legend outside the right edge of the axes.
plt.legend(title='Emotion', loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Top 10 Emotions Average Scores of All Documents over Time')
# Restrict the x-axis to 2017-01-01 .. 2025-01-31.
plt.xlim(pd.Timestamp('2017-01-01'), pd.Timestamp('2025-01-31'))
#plt.xlim(pd.Timestamp('2005-01-01'), pd.Timestamp('2025-01-31'))




# topic tuning for HDBSCAN parameters
from topictuner import TopicModelTuner as TMT

# Throw-away BERTopic model (no HDBSCAN model passed) wrapped in a
# TopicModelTuner; embeddings are created for all sentences and reduced once,
# before any search is run.
topic_modeltest = BERTopic(
    language='english',
    umap_model=umap_model,
    embedding_model=sent_model,
    vectorizer_model=vectorizer_model,
)
tmt = TMT.wrapBERTopicModel(topic_modeltest)
tmt.createEmbeddings(qdocs)
tmt.reduce()

# Random search with 60 iterations over the two candidate lists.
# NOTE(review): presumably the first list holds min_cluster_size values and the
# second holds min_samples as a fraction of the cluster size - confirm against
# the TopicTuner documentation.
random_first_candidates = [*range(25, 121)]
random_second_candidates = [.1, .2, .4, .6, .8, 1]
lastRunResultsDF = tmt.randomSearch(random_first_candidates, random_second_candidates, iters=60)
random_search_figure = tmt.visualizeSearch(lastRunResultsDF)
random_search_figure.show()

# Narrower search around the region of interest: first argument 40..70,
# second argument 0.05..0.39 in steps of 0.02.
starttime = time.time()
grid_first_candidates = [*range(40, 71)]
grid_second_candidates = [x / 100 for x in range(5, 41, 2)]
grid1 = tmt.pseudoGridSearch(grid_first_candidates, grid_second_candidates)
endtime = time.time()
elapsed = endtime - starttime
print("Time elapsed: {}".format(elapsed))  #621 sec, about 10 min
grid_search_figure = tmt.visualizeSearch(grid1)
grid_search_figure.show()


# Save the search results and read them straight back; on later runs the search
# above can be skipped and only the read kept.
grid1.to_pickle('gridsearch1.pkl')
grid1 = pd.read_pickle('gridsearch1.pkl')

# Fit one BERTopic model for every (min_cluster_size, min_samples) pair and
# write each model's topic words to its own CSV for manual comparison.
# NOTE(review): `hdbscan` is not imported in the part of the file visible
# here - confirm it is imported above.
starttime = time.time()
minclust = list(range(40, 66))
minsamp = list(range(6, 15))
# Same visiting order as nesting the two loops: every min_samples value is
# tried for one cluster size before moving on to the next size.
for i, j in ((size, samples) for size in minclust for samples in minsamp):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=i,
        min_samples=j,
        metric='euclidean',
        prediction_data=True,
    )
    tmodel = BERTopic(
        language='english',
        umap_model=umap_model,
        embedding_model=sent_model,
        hdbscan_model=clusterer,
        vectorizer_model=vectorizer_model,
        top_n_words=15,
        calculate_probabilities=True,
    )
    # The fit results are not used below; only the fitted model's topics
    # are exported.
    t1, p1 = tmodel.fit_transform(qdocs)
    t2 = pd.DataFrame.from_dict(tmodel.get_topics())
    t2.to_csv(f't_topics{i}-{j}.csv', index=False, header=True)
endtime = time.time()
elapsed = endtime - starttime
print("Time elapsed: {}".format(elapsed))  # 5.5 hrs
